{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#-*-coding:utf8-*-\n",
    "\n",
    "\"\"\"\n",
    "author:YJM\n",
    "\n",
    "date:20190420\n",
    "\n",
    "util function\n",
    "\n",
    "\"\"\"\n",
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import operator"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 通过movies.csv获取电影信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_item_info(input_file):\n",
    "    if not os.path.exists(input_file):\n",
    "        return {}\n",
    "    item_info={}\n",
    "    linenum=0\n",
    "    fp = open(input_file)\n",
    "    for line in fp:\n",
    "        if linenum == 0:\n",
    "            linenum += 1\n",
    "            continue\n",
    "        item = line.strip().split(',')\n",
    "        if len(item)<3:\n",
    "            continue\n",
    "        elif len(item) == 3:\n",
    "            itemid,title,genre = item[0],item[1],item[2]\n",
    "        elif len(item)>3:\n",
    "            itemid = item[0]\n",
    "            genre = item[1]\n",
    "            title = ','.join(item[1:-1])\n",
    "        item_info[itemid]=[title,genre]\n",
    "    fp.closed\n",
    "    return item_info"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 通过ratings15000.csv获取平均分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ave_score(input_file):\n",
    "    if not os.path.exists(input_file):\n",
    "        return {}\n",
    "    linenum = 0\n",
    "    record_dict = {}\n",
    "    score_dict = {}\n",
    "    fp = open(input_file)\n",
    "    for line in fp:\n",
    "        if linenum == 0:\n",
    "            linenum += 1\n",
    "            continue\n",
    "        item = line.strip().split(',')\n",
    "        if len(item)<4:\n",
    "            continue\n",
    "        userid,itemid,rating = item[0],item[1],float(item[2])\n",
    "        if itemid not in record_dict:\n",
    "            record_dict[itemid]=[0,0]\n",
    "        record_dict[itemid][0] += 1\n",
    "        record_dict[itemid][1] += rating\n",
    "    fp.closed\n",
    "    for itemid in record_dict:\n",
    "        score_dict[itemid] = round(record_dict[itemid][1]/record_dict[itemid][0],3)#小数点后保留3位有效数字\n",
    "    return score_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 获取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_train_data(input_file):\n",
    "    \"\"\"\n",
    "    :param input_file:\n",
    "    input_file:user item rating file\n",
    "    :return:\n",
    "    alist:[(userid,itemid,label),(userid,itemid,label)]\n",
    "    \"\"\"\n",
    "    if not os.path.exists(input_file):\n",
    "        return {}\n",
    "    score_dict = get_ave_score(input_file)#通过前面的方法获取平均分\n",
    "    neg_dict = {}#负样本集合\n",
    "    pos_dict = {}#正样本集合\n",
    "    train_data = []\n",
    "    linenum = 0\n",
    "    score_thr = 4#正负分界线：4分\n",
    "    fp = open(input_file)\n",
    "    for line in fp:\n",
    "        if linenum == 0:\n",
    "            linenum += 1\n",
    "            continue\n",
    "        item = line.strip().split(',')\n",
    "        if len(item)<4:\n",
    "            continue\n",
    "        userid,itemid,rating = item[0],item[1],float(item[2])\n",
    "        if userid not in pos_dict:\n",
    "            pos_dict[userid] = []\n",
    "        if userid not in neg_dict:\n",
    "            neg_dict[userid] = []\n",
    "        if rating >=score_thr:\n",
    "            pos_dict[userid].append((itemid,1))#这里正样本的格式是（userid，itemid，1）1代表正样本\n",
    "        else:\n",
    "            score = score_dict.get(itemid,0)#如果平均分没有获取到就设置成0\n",
    "            neg_dict[userid].append((itemid,score))#这里负样本的格式是（userid，itemid，平均得分）    \n",
    "    fp.closed\n",
    "    for userid in pos_dict:\n",
    "        data_num = min(len(pos_dict[userid]),len(neg_dict.get(userid,[])))\n",
    "        if data_num > 0:\n",
    "            train_data += [(userid,zuhe[0],zuhe[1]) for zuhe in pos_dict[userid]][:data_num]\n",
    "        else:\n",
    "            continue\n",
    "        #将负样本倒序排好并且取和正样本同样大小的样本集合\n",
    "        sorted_neg_list = sorted(neg_dict[userid],key=lambda element:element[1], reverse=True)[:data_num]\n",
    "        train_data += [(userid,zuhe[0],0)for zuhe in sorted_neg_list]#将最后一个值替换成0\n",
    "    return train_data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将lfm的模型进行训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lfm_train(train_data,F,alpha,beta,step):\n",
    "    user_vec = {}\n",
    "    item_vec = {}\n",
    "    for step_index in range(step):#迭代轮次\n",
    "        for data_instance in train_data:\n",
    "            userid,itemid,label = data_instance\n",
    "            if userid not in user_vec:\n",
    "                user_vec[userid] = init_model(F)\n",
    "            if itemid not in item_vec:\n",
    "                item_vec[itemid] = init_model(F)\n",
    "            delta = label - model_predict(user_vec[userid],item_vec[itemid])\n",
    "            for index in range(F):\n",
    "                user_vec[userid][index] += beta*(delta*item_vec[itemid][index]-alpha*user_vec[userid][index])\n",
    "                item_vec[itemid][index] += beta*(delta*user_vec[userid][index]-alpha*item_vec[itemid][index])\n",
    "            beta = beta*0.9      #学习率衰减\n",
    "    return user_vec,item_vec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 初始化模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def init_model(vector_len):\n",
    "    \"\"\"\n",
    "        vector_len:the len of vector\n",
    "    \"\"\"\n",
    "    return np.random.randn(vector_len)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 计算用户向量和物品向量的余弦夹角"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_predict(user_vector,item_vector):\n",
    "    res = np.dot(user_vector,item_vector)/(np.linalg.norm(user_vector)*np.linalg.norm(item_vector))\n",
    "    return res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 模型训练函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_train_process():\n",
    "    train_data = get_train_data(\"../data/ratings15000.csv\")\n",
    "    user_vec,item_vec = lfm_train(train_data,50,0.01,0.1,50)\n",
    "    recom_result = give_recom_result(user_vec,item_vec,'24')\n",
    "    print recom_result\n",
    "    ana_recom_result(train_data,'24',recom_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 模型训练执行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "global name 'item_id' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-17-fe5397853777>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel_train_process\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-14-d343d4040edc>\u001b[0m in \u001b[0;36mmodel_train_process\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m     \u001b[0mtrain_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_train_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"../data/ratings15000.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0muser_vec\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mitem_vec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlfm_train\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_data\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0.01\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0.1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m     \u001b[0mrecom_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgive_recom_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muser_vec\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mitem_vec\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'24'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m     \u001b[0;32mprint\u001b[0m \u001b[0mrecom_result\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m     \u001b[0mana_recom_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_data\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'24'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mrecom_result\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-16-246a4768b4c1>\u001b[0m in \u001b[0;36mgive_recom_result\u001b[0;34m(user_vec, item_vec, userid)\u001b[0m\n\u001b[1;32m      7\u001b[0m     \u001b[0muser_vector\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0muser_vec\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0muserid\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mitemid\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mitem_vec\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m         \u001b[0mitem_vector\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mitem_vec\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mitem_id\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     10\u001b[0m         \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muser_vector\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mitem_vector\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinalg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muser_vector\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinalg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem_vector\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     11\u001b[0m         \u001b[0mrecord\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mitemid\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: global name 'item_id' is not defined"
     ]
    }
   ],
   "source": [
    "model_train_process()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 推荐过程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def give_recom_result(user_vec,item_vec,userid):\n",
    "    fix_num = 10\n",
    "    if userid not in user_vec:\n",
    "        return []\n",
    "    record = {}\n",
    "    recom_list = []\n",
    "    user_vector = user_vec[userid]\n",
    "    for itemid in item_vec:\n",
    "        item_vector = item_vec[itemid]\n",
    "        res = np.dot(user_vector,item_vector)/(np.linalg.norm(user_vector))*(np.linalg.norm(item_vector))\n",
    "        record[itemid] = res\n",
    "    for zuhe in sorted(record.iteritems(),key=operator.itemgetter(1),reverse=True)[:fix_num]:\n",
    "        itemid = zuhe[0]\n",
    "        score = round(zuhe[1],3)\n",
    "        recom_list.append((itemid,score))\n",
    "    return recom_list                                                                            "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 评估推荐结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ana_recom_result(train_data,userid,recom_list):\n",
    "    item_info = get_item_info(\"../data/movies.csv\")\n",
    "    for data_instance in train_data:\n",
    "        tmp_userid,itemid,label = data_instance\n",
    "        if label == 1 and tmp_userid == userid:\n",
    "            print item_info[itemid]\n",
    "        print \"recom result\"\n",
    "        for zuhe in recom_list:\n",
    "            print item_info[zuhe[0]]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
